Concrete is the most important material in civil engineering. The concrete compressive strength is a highly nonlinear function of age and ingredients. These ingredients include cement, blast furnace slag, fly ash, water, superplasticizer, coarse aggregate, and fine aggregate.
Objective: predict the concrete compressive strength using the data available in the file concrete_data.xls. Apply feature engineering and model tuning to obtain an R2 score between 80% and 95%.
Concrete_Data.xls - Available from:
https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/
concrete_readme.txt - Available from:
https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Readme.txt
Name -- Data Type -- Measurement -- Description
# don't show warning messages
import warnings
warnings.filterwarnings("ignore")
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pandas_profiling
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score, cross_validate, cross_val_predict
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Adjust display and formatting settings
plt.style.use('ggplot')
pd.options.display.float_format = '{:,.3f}'.format
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
# NOTE: IPython.core.display is deprecated (since IPython 7.14); the public
# location for display/HTML is IPython.display.
from IPython.display import display, HTML
# Widen the notebook container so wide dataframes render without scrolling.
display(HTML("<style>.container { width:98% !important; }</style>"))
# Load the dataset (a provided .csv export of the original Concrete_Data.xls).
df = pd.read_csv('concrete.csv')
# Preview the first rows to sanity-check the load.
df.head()
# Row and column counts.
df.shape
# Column dtypes and non-null counts.
df.info()
# Statistical summary of the numeric columns, one attribute per row.
df.describe().T
# Visualize each numeric column's distribution with a horizontal boxplot.
# (Loop-body indentation restored -- it was lost when the notebook was
# flattened, leaving the loop syntactically invalid.)
for i, col in enumerate(df.columns):
    plt.figure(i, figsize=(20, 2))
    sns.boxplot(x=df[col])
    plt.show()
# We will use a Pandas Profiling library to provide a more detailed report of the data
# NOTE(review): the pandas_profiling package has since been renamed to
# ydata-profiling; the call below matches the legacy v2.6 API -- confirm the
# installed version before running.
#profile = df.profile_report(style={'full_width':True}) #syntax for older version of pandas_profiling package
profile = df.profile_report(html={'style':{'full_width':True}}) #syntax for v2.6 version of pandas_profiling that works with pandas v1.0+
# Bare expression: renders the interactive report when run in a notebook cell.
profile
# Pairwise scatter plots of every column, coloured by the target.
# NOTE(review): 'strength' is continuous, so using it as `hue` produces one
# legend entry per distinct value -- slow on this dataset; kept for parity.
sns.pairplot(df, diag_kind='kde', hue='strength', height=4)
plt.show()
# Same pairwise view without the hue encoding.
sns.pairplot(df, diag_kind='kde', height=4)
plt.show()
# Pearson correlation matrix in tabular form.
df.corr()
# Separate the predictor matrix from the target series.
y = df['strength'].copy()
X = df.drop(columns='strength').copy()
# Hold out 20% of the rows as a final test set.
# **Note: the training set is used for initial model evaluation and tuning;
# the final model is scored on the held-out test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)
# Search for the polynomial feature degree that minimizes test-set RMSE.
# (Loop-body indentation restored -- it was lost when the notebook was
# flattened, leaving the loop syntactically invalid.)
rmses = []
degrees = np.arange(1, 10)
min_rmse, min_deg = 1e10, 0
for deg in degrees:
    # Expand the training features to the current polynomial degree.
    poly_features = PolynomialFeatures(degree=deg, include_bias=False)
    x_poly_train = poly_features.fit_transform(X_train)
    # Fit ordinary least squares on the expanded features.
    poly_reg = LinearRegression()
    poly_reg.fit(x_poly_train, y_train)
    # Evaluate on the held-out test data.  Use transform (not fit_transform):
    # the transformer fitted on the training data must be applied as-is to
    # the test set, never re-fit on it.
    x_poly_test = poly_features.transform(X_test)
    poly_predict = poly_reg.predict(x_poly_test)
    poly_rmse = np.sqrt(mean_squared_error(y_test, poly_predict))
    rmses.append(poly_rmse)
    # Track the best (lowest-RMSE) degree seen so far.
    if min_rmse > poly_rmse:
        min_rmse = poly_rmse
        min_deg = deg
# Report the winner and plot RMSE as a function of degree.
print('Best degree {} with RMSE {}'.format(min_deg, min_rmse))
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(degrees, rmses)
ax.set_yscale('log')
ax.set_xlabel('Degree')
ax.set_ylabel('RMSE')
# Cross-validation splitter shared by every model evaluation below.
# shuffle=True is required here: KFold raises a ValueError when random_state
# is set while shuffle is False (scikit-learn >= 0.24).
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
# --- BaggingRegressor: baseline CV score, then hyper-parameter tuning ---
# Model without scaling
model = BaggingRegressor(random_state=1)
# Model with data scaling
#model = make_pipeline(StandardScaler(), BaggingRegressor(random_state=1))
# Baseline: mean R2 across the CV folds on the training data.
score = cross_val_score(model, X_train, y_train, cv=kfold, scoring='r2')
R2_train = score.mean()
# print the score
print("R2_Train: %.3f" % (R2_train))
# Tune model with hyper-parameters.
# NOTE: the 'base_estimator' parameter was renamed to 'estimator' in
# scikit-learn 1.2 and removed in 1.4.
param_grid = {'estimator': (DummyRegressor(), DecisionTreeRegressor(),
                            KNeighborsRegressor(), SVR(gamma='scale')),
              'n_estimators': (10, 50, 100),
              'bootstrap': (True, False)}
# use GridSearchCV for full-grid iteration
gs = GridSearchCV(model, param_grid, cv=kfold)
gs.fit(X_train, y_train)
# Score the tuned model on the train and held-out test data.
R2_train = gs.score(X_train, y_train)
R2_test = gs.score(X_test, y_test)
# print the score
print("R2_Train: %.3f | R2_Test: %.3f " % (R2_train, R2_test))
# print the best parameters
print()
print("BestParameters:")
print(gs.best_params_)
# Record the score; a dataframe accumulates results for final comparison.
model_scores = {'Model': ['BaggingRegressor'],
                'R2_Train': R2_train,
                'R2_Test': R2_test
                }
resultsDf = pd.DataFrame(model_scores)
resultsDf = resultsDf[['Model', 'R2_Train', 'R2_Test']]
resultsDf
# --- RandomForestRegressor: baseline CV score, then hyper-parameter tuning ---
# Model without scaling
model = RandomForestRegressor(random_state=1)
# Model with data scaling
#model = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=1))
# Baseline: mean R2 across the CV folds on the training data.
score = cross_val_score(model, X_train, y_train, cv=kfold, scoring='r2')
R2_train = score.mean()
# print the score
print("R2_Train: %.3f" % (R2_train))
# Tune model with hyper-parameters.
# NOTE: criterion values 'mse'/'mae' were renamed to 'squared_error'/
# 'absolute_error' (old names removed in scikit-learn 1.2), and
# max_features='auto' (equivalent to 1.0 for regressors) was removed in 1.3.
param_grid = {'n_estimators': (10, 50, 100),
              'criterion': ('squared_error', 'absolute_error'),
              'max_features': (1.0, 'sqrt', 'log2'),
              'bootstrap': (True, False)}
# use GridSearchCV for full-grid iteration
gs = GridSearchCV(model, param_grid, cv=kfold)
gs.fit(X_train, y_train)
# Score the tuned model on the train and held-out test data.
R2_train = gs.score(X_train, y_train)
R2_test = gs.score(X_test, y_test)
# print the score
print("R2_Train: %.3f | R2_Test: %.3f " % (R2_train, R2_test))
# print the best parameters
print()
print("BestParameters:")
print(gs.best_params_)
# Append this model's scores to the comparison dataframe.
resultsDf.loc[1] = ['RandomForestRegressor', R2_train, R2_test]
resultsDf
# --- AdaBoostRegressor: baseline CV score, then hyper-parameter tuning ---
# Model without scaling
model = AdaBoostRegressor(random_state=1)
# Model with data scaling
#model = make_pipeline(StandardScaler(), AdaBoostRegressor(random_state=1))
# Baseline: mean R2 across the CV folds on the training data.
score = cross_val_score(model, X_train, y_train, cv=kfold, scoring='r2')
R2_train = score.mean()
# print the score
print("R2_Train: %.3f" % (R2_train))
# Tune model with hyper-parameters.
# NOTE: the 'base_estimator' parameter was renamed to 'estimator' in
# scikit-learn 1.2 and removed in 1.4.
param_grid = {'estimator': (DummyRegressor(), DecisionTreeRegressor(),
                            KNeighborsRegressor(), SVR(gamma='scale')),
              'n_estimators': (10, 50, 100),
              'loss': ('linear', 'square', 'exponential')}
# use GridSearchCV for full-grid iteration
gs = GridSearchCV(model, param_grid, cv=kfold)
gs.fit(X_train, y_train)
# Score the tuned model on the train and held-out test data.
R2_train = gs.score(X_train, y_train)
R2_test = gs.score(X_test, y_test)
# print the score
print("R2_Train: %.3f | R2_Test: %.3f " % (R2_train, R2_test))
# print the best parameters
print()
print("BestParameters:")
print(gs.best_params_)
# Append this model's scores to the comparison dataframe.
resultsDf.loc[2] = ['AdaBoostRegressor', R2_train, R2_test]
resultsDf
# --- GradientBoostingRegressor: baseline CV score, then hyper-parameter tuning ---
# Model without scaling
model = GradientBoostingRegressor(random_state=1)
# Model with data scaling
#model = make_pipeline(StandardScaler(), GradientBoostingRegressor(random_state=1))
# Baseline: mean R2 across the CV folds on the training data.
score = cross_val_score(model, X_train, y_train, cv=kfold, scoring='r2')
R2_train = score.mean()
# print the score
print("R2_Train: %.3f" % (R2_train))
# Tune model with hyper-parameters.
# NOTE: loss values 'ls'/'lad' were renamed to 'squared_error'/
# 'absolute_error' (old names removed in scikit-learn 1.2); criterion
# 'mse'/'mae' were likewise removed, leaving 'friedman_mse' and
# 'squared_error'; max_features='auto' was removed in 1.3 (None keeps the
# same all-features behaviour).
param_grid = {'loss': ('squared_error', 'absolute_error', 'huber', 'quantile'),
              'n_estimators': (10, 50, 100),
              'criterion': ('friedman_mse', 'squared_error'),
              'max_features': (None, 'sqrt', 'log2'),
              }
# use GridSearchCV for full-grid iteration
gs = GridSearchCV(model, param_grid, cv=kfold)
gs.fit(X_train, y_train)
# Score the tuned model on the train and held-out test data.
R2_train = gs.score(X_train, y_train)
R2_test = gs.score(X_test, y_test)
# print the score
print("R2_Train: %.3f | R2_Test: %.3f " % (R2_train, R2_test))
# print the best parameters
print()
print("BestParameters:")
print(gs.best_params_)
# Append this model's scores and show the final comparison table.
# (A duplicate trailing `resultsDf` expression was collapsed to one.)
resultsDf.loc[3] = ['GradientBoostRegressor', R2_train, R2_test]
resultsDf